import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10,6)
plt.rcParams['axes.titlesize'] = 24
plt.rcParams['axes.labelsize'] = 20
plt.rcParams['lines.linewidth'] = 3
plt.rcParams['lines.markersize'] = 10
plt.rcParams['xtick.labelsize'] = 16
plt.rcParams['ytick.labelsize'] = 16
import seaborn as sns
# sns.set(style="white")
# sns.set(style="whitegrid", color_codes=True)
plt.style.use('ggplot')
import os
import glob
import warnings
warnings.filterwarnings('ignore')
path = '../../Data/Zendesk/'
df_tickets_agg = pd.DataFrame()
for f in glob.glob(path + "*.xlsx"):
df = pd.read_excel(f)
df_tickets_agg = df_tickets_agg.append(df,ignore_index=True)
df_checkins = (pd.read_csv('../../Data/Metabase/checkins.csv')
.filter(['person_id', 'count checkin'])
.rename(columns={'count checkin':'Count_checkin'})
)
df_plans = pd.read_csv('../../Data/Metabase/plan_values.csv').rename(columns={'max': 'Plan_value'})
df_cancel_1 = pd.read_csv('../../Data/Metabase/cancelamentos_1.csv')
df_cancel_2 = pd.read_csv('../../Data/Metabase/cancelamentos_2.csv')
df_cancel = (pd.concat([df_cancel_1, df_cancel_2])
.filter(['person_id', 'change_to_type']))
# creatin cancelation flag
df_cancel = (df_cancel.groupby('person_id').count()
.reset_index()
.sort_values('change_to_type', ascending=False)
.assign(Cancel=1))
df_agg = pd.merge(df_tickets_agg, df_plans, how='left', left_on='Requester_ID', right_on='person_id')
df_agg = pd.merge(df_agg, df_checkins, how='left', left_on='Requester_ID', right_on='person_id')
df_agg = pd.merge(df_agg, df_cancel, how='left', left_on='Requester_ID', right_on='person_id')
df_agg = df_agg.filter(['Requester_ID', 'Month', 'Volume_7days', 'Volume_15days',
'Volume_Monthly', 'ReclameAqui', 'SocialMedia', 'Tempo_Medio_Chat',
'Tempo_Medio_Email', 'AWT_Chat', '%NFCR', '%Insatisfação(CSAT)',
'CSAT_Rated', 'Plan_value', 'Count_checkin', 'Cancel'])
df_agg.info()
df_agg.head(3)
df_agg.isnull().mean() * 100
import missingno as msno
%matplotlib inline
msno.matrix(df_agg)
# Every Volume Chat, E-mail, Social NaN is zero
df_agg.Tempo_Medio_Chat.loc[df_agg.Tempo_Medio_Chat.isna()] = 0
df_agg.Tempo_Medio_Email.loc[df_agg.Tempo_Medio_Email.isna()] = 0
# Assuming that checkin==NaN is everyone who has median checked in (can be mistake on matebase query)
df_agg['Count_checkin'].loc[df_agg['Count_checkin'].isna()] = df_agg.Count_checkin.median()
# Assuming that cancel is NaN is every one who has not tried to cancel (can be mistake on matebase query)
df_agg.Cancel.loc[df_agg.Cancel.isna()] = 0
df_agg.Plan_value[df_agg.Plan_value.isna()] = df_agg.Plan_value.median()
df_agg.AWT_Chat.loc[df_agg.AWT_Chat.isna()] = 0
df_agg.head(3)
df_agg['Target'] = 0
# motivo de contato pagamento
df_agg.Target.loc[#(df_agg.Volume_7days>3) |
#(df_agg.Volume_15days>3) |
(df_agg.Volume_Monthly>=4) |
(df_agg.ReclameAqui=='yes')
#|(df_agg['%Insatisfação(CSAT)']==1) |
#(df_agg['%NFCR']==1)
] = 1
print(df_agg.Target.value_counts(), '\n')
df_agg.Target.value_counts().plot(kind='bar', title='target')
df = df_agg
# Down-sample majority class
from sklearn.utils import resample
df_majority = df[df.Target == 0]
df_minority = df[df.Target == 1]
df_majority_downsampled = resample(df_majority,
replace=False,
n_samples=df_minority.shape[0],
random_state=42)
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
df_downsampled.Target.value_counts()
from pandas_profiling import ProfileReport
prof = ProfileReport(df_downsampled)
prof